In this document we present the joint analysis of the PASS1A metabolomics datasets.

Load all datasets

Load the data from the cloud, including: phenotypic data, metabolomic datasets, and metabolomics dictionary.

source("~/Desktop/repos/motrpac-bic-norm-qc/tools/supervised_normalization_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/unsupervised_normalization_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/gcp_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/association_analysis_methods.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/data_aux_functions.R")
source("~/Desktop/repos/motrpac/tools/prediction_ml_tools.R")
library(randomForest) # for classification tests

# Load the DMAQC phenotypic data from the cloud bucket.
# load_from_bucket returns a list; the first element holds the data frame.
merged_dmaqc_data <- load_from_bucket("merged_dmaqc_data2019-10-15.RData",
    "gs://bic_data_analysis/pass1a/pheno_dmaqc/", FALSE)
merged_dmaqc_data <- merged_dmaqc_data[[1]]
# Index rows by vial label so samples can be looked up by name later
rownames(merged_dmaqc_data) <- as.character(merged_dmaqc_data$vial_label)
# Define the tissue variable
merged_dmaqc_data$tissue <- merged_dmaqc_data$sampletypedescription
# Define the time-to-freeze variable: death-to-collection plus
# collection-to-freeze (both in minutes)
merged_dmaqc_data$time_to_freeze <-
  merged_dmaqc_data$calculated.variables.time_death_to_collect_min +
  merged_dmaqc_data$calculated.variables.time_collect_to_freeze_min

# col time vs. control
# df = data.frame(
#   bid = merged_dmaqc_data$bid,
#   edta_col_time = merged_dmaqc_data$calculated.variables.edta_coll_time,
#   time_to_freeze = merged_dmaqc_data$time_to_freeze,
#   is_control = merged_dmaqc_data$animal.key.is_control,
#   tp = merged_dmaqc_data$animal.key.timepoint,
#   tissue = merged_dmaqc_data$specimen.processing.sampletypedescription
# )
# df = unique(df)
# boxplot(edta_col_time/3600 ~ is_control,df)
# boxplot(edta_col_time/3600 - tp ~ is_control,df)
# wilcox.test(edta_col_time/3600 ~ is_control,df)

# Blood (EDTA plasma) freeze times: for blood samples the relevant
# freeze time is measured from the EDTA spin rather than from collection.
blood_samples <-
  merged_dmaqc_data$specimen.processing.sampletypedescription ==
  "EDTA Plasma"
blood_freeze_time <-
  as.difftime(merged_dmaqc_data$specimen.processing.t_freeze, units = "mins") -
  as.difftime(merged_dmaqc_data$specimen.processing.t_edtaspin, units = "mins")
blood_freeze_time <- as.numeric(blood_freeze_time)
# Overwrite time_to_freeze for the blood samples with the spin-to-freeze
# time. The original used a hard-to-read chained assignment
# (a = b[i] = c); split into two explicit assignments that produce the
# exact same values.
merged_dmaqc_data$time_to_freeze[blood_samples] <-
  blood_freeze_time[blood_samples]
time_to_freeze <- blood_freeze_time[blood_samples]

# Load the parsed PASS1A metabolomics datasets from the analysis bucket
metabolomics_bucket_obj <- load_from_bucket(
  bucket = "gs://bic_data_analysis/pass1a/metabolomics/",
  file = "metabolomics_parsed_datasets_pass1a_external1.RData"
)
metabolomics_parsed_datasets <-
  metabolomics_bucket_obj$metabolomics_parsed_datasets

Define the variables to be adjusted for:

# Biospecimen covariates to adjust for.
# "bid" itself is not a covariate but is required for matching datasets.
# "calculated.variables.edta_coll_time" is deliberately excluded - blood
# freeze times are handled separately above.
biospec_cols <- c(
  "acute.test.distance",
  "calculated.variables.time_to_freeze",
  "bid"
)
# Design variables for the differential analysis
differential_analysis_cols <- c(
  "animal.registration.sex",
  "animal.key.timepoint",
  "animal.key.is_control"
)
# Pipeline QC covariates
pipeline_qc_cols <- c("sample_order")

Log-transform: effect on variance

Some sites do not use the log transformation on their dataset. In this section we plot the coefficient of variation as a function of the mean intensity. We take a single dataset as an example to show how log-transformed data have reduced dependency and smoother plots.

As an additional analysis we also plot the number of missing values per metabolite as a function of its mean intensity. We show that while there is high correlation, some missing values appear at fairly high intensities. This is important for imputation as some sites use a fixed low value instead of knn imputation.

# CV vs. mean intensity: use a single dataset as a worked example
library(gplots)
d <- metabolomics_parsed_datasets[[
  "white_adipose_powder,metab_u_hilicpos,unnamed"]]
dx <- d$sample_data
# Coefficient of variation (sd / mean) of a numeric vector, ignoring NAs.
CoV <- function(x) {
  sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)
}
# Per-metabolite means and CVs on the raw (unlogged) data
dmeans = apply(dx,1,mean,na.rm=T)
CoVs = apply(dx,1,CoV)
# Keep metabolites with a defined CV (drops all-NA / zero-mean rows)
inds = !is.na(CoVs)
df = data.frame(Mean_intensity = dmeans[inds],CoV = CoVs[inds])
plot(CoV~Mean_intensity,df,cex=0.5,pch=20,main="Raw data")
lines(lowess(CoV~Mean_intensity,df),lty=2,lwd=2,col="blue")

# Repeat after log2
dx = log(1+d$sample_data,base=2)
dmeans = apply(dx,1,mean,na.rm=T)
CoVs = apply(dx,1,CoV)
inds = !is.na(CoVs)
df = data.frame(Mean_intensity = dmeans[inds],CoV = CoVs[inds])
plot(CoV~Mean_intensity,df,cex=0.5,pch=20,main="Log2 data")
lines(lowess(CoV~Mean_intensity,df),lty=2,lwd=2,col="blue")

# Plot number of NAs vs intensity mean
dx = log(1+d$sample_data,base=2)
dmeans = apply(dx,1,mean,na.rm=T)
num_nas = rowSums(is.na(dx))
# NOTE(review): `inds` below is reused from the log2 CV section above
# rather than being recomputed for this analysis - confirm that
# restricting to metabolites with a defined CV is intended here
df = data.frame(Num_NAs = num_nas[inds],Mean_intensity = dmeans[inds])
rho = cor(df$Num_NAs,df$Mean_intensity,method="spearman")
rho = format(rho,digits=2)
plot(Num_NAs~Mean_intensity,df,cex=0.5,pch=20,
     main=paste("Spearman:",rho))

Load the preprocessed and normalized data

# Load the preprocessed and normalized metabolomics datasets
metabolomics_processed_datasets <- load_from_bucket(
  file = "metabolomics_processed_datasets11182019.RData",
  bucket = "gs://bic_data_analysis/pass1a/metabolomics/"
)[[1]]

# Attach a reduced sample metadata table to each dataset, keeping only
# the covariate and design columns selected above
for (currname in names(metabolomics_processed_datasets)) {
  curr_data <- metabolomics_processed_datasets[[currname]]$normalized_data[[1]]
  # Match metadata rows to the dataset's samples (columns)
  curr_meta <- merged_dmaqc_data[colnames(curr_data),
        union(biospec_cols, differential_analysis_cols)]
  # Remove metadata variables with >= 10% missing values;
  # colSums is the idiomatic (and faster) form of apply(is.na(.), 2, sum)
  na_counts <- colSums(is.na(curr_meta))
  # drop = FALSE keeps a data frame even if only one column survives
  curr_meta <- curr_meta[, na_counts / nrow(curr_meta) < 0.1, drop = FALSE]
  metabolomics_processed_datasets[[currname]]$sample_meta_parsed <- curr_meta
}

Log-transform: effect on differential analysis in targeted data

Untargeted data are typically log-transformed and analyzed using linear models. On the other hand, concentration data are sometimes analyzed with the same type of models but using the original data. This raises a problem if we wish to compare exact statistics from these data. In this section we perform residual analysis for single metabolites. Our goal is to identify if concentration data behaves “normally” when not log-transformed. The analysis below examines the residuals of the data after fitting linear models for each metabolite, adjusting for freeze time and sex. We then compare the results with and without the log-transformation, counting the number of metabolites with a significant evidence for non-normally distributed residuals.

# Test a residual vector for normality.
# Primary test is Shapiro-Wilk; if it fails (e.g. too few/many values or
# near-constant input) we fall back to a Kolmogorov-Smirnov test against
# a normal distribution with matching mean and sd.
# A constant vector (sd == 0) returns 0 (treated as clearly non-normal).
# The original comment incorrectly described this as a KS test and the
# fallback note was placed after the try(); both fixed here, and the
# bare try(return(...)) is replaced with an explicit tryCatch.
is_normal_test <- function(v) {
  if (sd(v, na.rm = TRUE) == 0) {
    return(0)
  }
  p <- tryCatch(shapiro.test(v)$p.value, error = function(e) NULL)
  if (!is.null(p)) {
    return(p)
  }
  ks.test(v, "pnorm", mean(v, na.rm = TRUE), sd(v, na.rm = TRUE))$p.value
}
# go over the named datasets, get a logged and an unlogged version of
# the data, use these as inputs for the regression
residual_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
  # Only targeted (concentration) datasets are examined here
  if(!metabolomics_processed_datasets[[nn1]]$is_targeted){next}
  x_log = metabolomics_processed_datasets[[nn1]]$normalized_data[[1]]
  # Invert the log2 transform to obtain the unlogged version
  x_unlog = 2^x_log
  
  # take the covariates, ignore distances
  # (biospec_cols[2] is the time-to-freeze column)
  # NOTE(review): unique() collapses repeated metadata rows - assumes
  # the resulting row order matches the columns of x_log; confirm
  x_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
  curr_covs = x_meta[,intersect(colnames(x_meta),biospec_cols[2])]
  curr_covs = data.frame(curr_covs,
           sex=x_meta$animal.registration.sex)
  
  # get the lm objects
  # For each time point, fit the per-metabolite models on both the
  # logged and unlogged data and record the normality p-value of the
  # residuals of each fit (columns "log" / "not log")
  curr_models = list()
  for(tp in unique(x_meta$animal.key.timepoint)){
      res_log = apply(
        x_log,1,
        pass1a_simple_differential_abundance,
        tps = x_meta$animal.key.timepoint,tp=tp,
        is_control = x_meta$animal.key.is_control,
        covs = curr_covs,return_model=T
      )
      res_unlog = apply(
        x_unlog,1,
        pass1a_simple_differential_abundance,
        tps = x_meta$animal.key.timepoint,tp=tp,
        is_control = x_meta$animal.key.is_control,
        covs = curr_covs,return_model=T
      )
      is_norm = cbind(
        sapply(res_log,function(x)is_normal_test(residuals(x))),
        sapply(res_unlog,function(x)is_normal_test(residuals(x)))
      )
      colnames(is_norm) = c("log","not log")
      curr_models[[as.character(tp)]] = is_norm
      
      # # test a specific model
      # ind = 246
      # plot(x_unlog[ind,],x_log[ind,])
      # 
      # resids_log = rstandard(res_log[[ind]])
      # fit_log = res_log[[ind]]$fitted.values
      # resids_unlog = rstandard(res_unlog[[ind]])
      # fit_unlog = res_unlog[[ind]]$fitted.values
      # 
      # plot(fit_log,resids_log)
      # plot(fit_unlog,resids_unlog)
  }
  residual_analysis_results[[nn1]] = curr_models
}

# Is there a significant difference between the two options?
# One-sided paired Wilcoxon per dataset/time point: tests whether the
# log-data normality p-values (col 1) are greater than the unlogged
# ones (col 2), i.e. whether logging makes residuals "more normal"
log_vs_unlog_summ_mat = sapply(residual_analysis_results,
    function(x)sapply(x,
        function(y)
          wilcox.test(y[,1],y[,2],paired = T,alternative = "g")$p.value))

# Count the number of non-normal metabolites
# (normality p-value < 0.05) per dataset/time point, with and
# without the log transform; columns sorted for a consistent layout
num_nonnormal_log = sapply(residual_analysis_results,
    function(x)sapply(x,
        function(y)sum(y[,1]<0.05)))
num_nonnormal_log = 
  num_nonnormal_log[,order(colnames(num_nonnormal_log))]
num_nonnormal_unlog = sapply(residual_analysis_results,
    function(x)sapply(x,
        function(y)sum(y[,2]<0.05)))
num_nonnormal_unlog = 
  num_nonnormal_unlog[,order(colnames(num_nonnormal_unlog))]

# Visualize the per-cell difference (log minus unlogged counts);
# negative values mean logging reduced the number of non-normal fits
library(corrplot)
par(mar = c(5,5,5,10))
normdiffs = t(num_nonnormal_log)- t(num_nonnormal_unlog)
corrplot(normdiffs,is.corr = F,tl.cex = 0.7)

Targeted vs. Untargeted: single metabolite comparison

Compute statistics for each dataset

Compare overlaps, effect sizes, and correlations within tissues. Compare targeted-untargeted pairs only. For differential analysis we use the same model as in the analysis above.

# Transform the data matrix to have compound names as row names.
# This requires removing rows without names and changing the rownames of
# the input matrix x.
# Also, do not assume that the row annotation order necessarily fits the
# data.
#
# Args:
#   x           - numeric matrix; rows are metabolites, columns samples
#   row_annot_x - row annotation table; one of its columns matches
#                 rownames(x) and the motrpac_comp_name column holds the
#                 standardized compound names
# Returns: x restricted (and reordered) to the annotated rows, with
#   rownames set to the motrpac_comp_name values.
extract_by_comp_name_from_row_annot<-function(x,row_annot_x){
  # get the column that has the row names of x or at least
  # have the greatest intersection with it
  int_sizes = apply(row_annot_x,2,function(x,y)length(intersect(x,y)),y=rownames(x))
  ind = which(int_sizes==max(int_sizes,na.rm = T))[1]
  # update the annotation table to have only rows that have 
  # a row in x and then update the rownames to be from the 
  # selected column
  # BUGFIX: is.element's second formal is `table`, not `set`; the
  # original `set=` keyword raised an "unused argument" error
  row_annot_x = row_annot_x[is.element(row_annot_x[,ind],rownames(x)),]
  rownames(row_annot_x) = row_annot_x[,ind]
  # we can now intersect x and the annotation using their row names
  # and update x accordingly. drop = FALSE keeps x a matrix even when a
  # single row is shared (otherwise rownames(x) <- ... below fails on a
  # dimensionless vector)
  shared = intersect(rownames(row_annot_x),rownames(x))
  x = x[shared,,drop = FALSE]
  row_annot_x = row_annot_x[shared,]
  rownames(x) = row_annot_x$motrpac_comp_name
  return(x)
}

# Normalization-method pairs to compare; in each pair the first entry is
# the targeted dataset's normalization and the second the untargeted's.
tar_vs_untar_norm_pairs <- c(
  lapply(c("log2,imp,TMM", "log2,imp", "med,log,imp"),
         function(untar) c("log2,imp", untar)),
  lapply(c("imp,none", "imp,TMM", "med,log,imp"),
         function(untar) c("imp,none", untar))
)

# The loop below is the core of the computations for comparing 
# targeted and untargeted data when using known compound names
#
# For each normalization option (a pair from the list above) we compute
# all pairwise correlation matrices of the overlapping metabolites and
# the differential analysis results (again of the overlap).
# These objects are then used later for other quantitative comparisons
norm_method2comparison_results = list()
for(norm_pairs in tar_vs_untar_norm_pairs){
  tar_norm_method = norm_pairs[1]
  untar_norm_method = norm_pairs[2]
  single_metabolite_corrs = list()
  single_metabolite_de = c()
  named2covered_shared_metabolites = list()
  # Outer loop: nn1 iterates over targeted datasets only
  for(nn1 in names(metabolomics_processed_datasets)){
    nn1_tissue = metabolomics_processed_datasets[[nn1]]$tissue
    if(!metabolomics_processed_datasets[[nn1]]$is_targeted){next}
    single_metabolite_corrs[[nn1]] = list()
    # NOTE(review): assigning NULL to a list element removes it rather
    # than initializing it; harmless here only because the element does
    # not exist yet
    named2covered_shared_metabolites[[nn1]] = NULL
    # Inner loop: nn2 iterates over untargeted datasets of the same tissue
    for(nn2 in names(metabolomics_processed_datasets)){
      if(nn2 == nn1){next}
      if(metabolomics_processed_datasets[[nn2]]$is_targeted){next}
      nn2_tissue = metabolomics_processed_datasets[[nn2]]$tissue
      nn2_dataset = paste(strsplit(nn2,split=",")[[1]][3:4],collapse = ",")
      if(nn1_tissue!=nn2_tissue){next}
      # get the numeric datasets and their annotation
      x = metabolomics_processed_datasets[[nn1]]$normalized_data[[tar_norm_method]]
      y = metabolomics_processed_datasets[[nn2]]$normalized_data[[untar_norm_method]]
      row_annot_x = metabolomics_processed_datasets[[nn1]]$row_annot
      row_annot_y = metabolomics_processed_datasets[[nn2]]$row_annot
      # transform metabolite names to the motrpac comp name
      x = extract_by_comp_name_from_row_annot(x,row_annot_x)
      y = extract_by_comp_name_from_row_annot(y,row_annot_y)
      # align the sample sets
      bid_y = merged_dmaqc_data[colnames(y),"bid"]
      bid_x = merged_dmaqc_data[colnames(x),"bid"]    
      # step 1: merge samples from the same BID
      if(length(unique(bid_x))!=length(bid_x)){
        x = aggregate_repeated_samples(x,bid_x)
      }
      else{
        colnames(x) = bid_x
      }
      if(length(unique(bid_y))!=length(bid_y)){
        y = aggregate_repeated_samples(y,bid_y)
      }else{
        colnames(y) = bid_y
      }
      # step 2: use the shared bio ids
      shared_bids = as.character(intersect(colnames(y),colnames(x)))
      x = as.matrix(x[,shared_bids])
      y = as.matrix(y[,shared_bids])
      # At this point x and y are over the same BIDs, now we add the metadata
      # NOTE(review): the metadata is taken from nn1 (the targeted
      # dataset) even though the variable is named y_meta - confirm
      y_meta = 
        unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
      rownames(y_meta) = y_meta$bid
      y_meta = y_meta[shared_bids,]
      
      # If one dataset is log-transformed and the other is not
      # transform back to the original values
      is_tar_log = grepl("log",tar_norm_method)
      is_untar_log = grepl("log",untar_norm_method)
      if(is_tar_log && !is_untar_log){
        x = 2^x
      }
      if(!is_tar_log && is_untar_log){
        y = 2^y
      }
    
      # If data are not log-transformed then scale the rows
      if(!is_tar_log || !is_untar_log){
        x = t(scale(t(x),center = T,scale = T))
        y = t(scale(t(y),center = T,scale = T))
      }
    
      # get the shared metabolites
      shared_metabolites = intersect(rownames(x),rownames(y))
      shared_metabolites = na.omit(shared_metabolites)
      if(length(shared_metabolites)==0){next}
      named2covered_shared_metabolites[[nn1]] = union(
        named2covered_shared_metabolites[[nn1]],
        shared_metabolites
      )
    
      # Compute the correlation matrices of the shared metabolites
      # (a single shared metabolite collapses to a scalar correlation)
      if(length(shared_metabolites)>1){
          corrs =cor(t(x[shared_metabolites,]),
                t(y[shared_metabolites,]),method = "spearman")
      }
      else{
          corrs = cor(x[shared_metabolites,],
                y[shared_metabolites,],method = "spearman")
      }
    
      # take the covariates (ignore distances)
      # biospec_cols[2] is the time-to-freeze column
      curr_cov_cols = intersect(colnames(y_meta),biospec_cols[2])
      curr_covs = data.frame(y_meta[,curr_cov_cols])
      names(curr_covs) = curr_cov_cols
      curr_covs$sex = y_meta$animal.registration.sex # add sex
    
      # differential analysis
      for(tp in unique(y_meta$animal.key.timepoint)){
        curr_control_tp = NULL
        # if(tp == 7 || tp == 4){curr_control_tp=7}
        resx = t(apply(
          matrix(x[shared_metabolites,],nrow=length(shared_metabolites)),1,
          pass1a_simple_differential_abundance,
          tps = y_meta$animal.key.timepoint,tp=tp,
          is_control = y_meta$animal.key.is_control,
          covs = curr_covs,return_model=F,
          control_tp = curr_control_tp
        ))
        resy = t(apply(
          matrix(y[shared_metabolites,],nrow=length(shared_metabolites)),1,
          pass1a_simple_differential_abundance,
          tps = y_meta$animal.key.timepoint,tp=tp,
          is_control = y_meta$animal.key.is_control,
          covs = curr_covs,return_model=F,
          control_tp = curr_control_tp
        ))
        # Add dataset information, time point, tissue
        # These are important annotations for our summary matrix
        # called single_metabolite_de below
        added_columns = matrix(cbind(
          rep(nn1,length(shared_metabolites)),
          rep(nn2,length(shared_metabolites)),
          shared_metabolites,
          rep(tp,length(shared_metabolites)),
          rep(nn1_tissue,length(shared_metabolites))
        ),nrow=length(shared_metabolites))
        resx = cbind(resx,rep(T,nrow(resx)))
        colnames(resx)[ncol(resx)] = "is_targeted"
        resy = cbind(resy,rep(F,nrow(resy)))
        colnames(resy)[ncol(resy)] = "is_targeted"
        # Keep the dataset column matching each result: resx (targeted)
        # drops the nn2 column, resy (untargeted) drops the nn1 column
        if(nrow(resx)>1){
          resx = cbind(added_columns[,-2],resx)
          resy = cbind(added_columns[,-1],resy)
        }
        else{
          resx = c(added_columns[,-2],resx)
          resy = c(added_columns[,-1],resy)
       }
        single_metabolite_de = rbind(single_metabolite_de,resx)
        single_metabolite_de = rbind(single_metabolite_de,resy)
      }
      single_metabolite_corrs[[nn1]][[nn2]] = corrs
    }
  }

  # Reformat the differential analysis results for easier comparison later
  single_metabolite_de = data.frame(single_metabolite_de)
  names(single_metabolite_de) = c("dataset","metabolite","tp","tissue",
    "Est","Std","Tstat","Pvalue","is_targeted")
  # Columns 5+ are numeric statistics; 1-4 are character annotations
  for(col in names(single_metabolite_de)[-c(1:4)]){
    single_metabolite_de[[col]] = as.numeric(
      as.character(single_metabolite_de[[col]]))
  }
  for(col in names(single_metabolite_de)[1:4]){
    single_metabolite_de[[col]] = 
      as.character(single_metabolite_de[[col]])
  }
  # Remove duplications
  # Rounding the p-values - necessary for removing duplicates
  # using the unique function
  rownames(single_metabolite_de) = NULL
  for(nn in names(single_metabolite_de)){
    ndig = 5
    if(grepl("pval",nn,ignore.case = T)){
      ndig = 10
    }
    if(is.numeric(single_metabolite_de[[nn]])){
      single_metabolite_de[[nn]] = 
      round(single_metabolite_de[[nn]],digits = ndig)
    }
  }
  single_metabolite_de = unique(single_metabolite_de)
  
  # Finally, store all the results for the current normalization pair
  pair_name = paste(tar_norm_method,untar_norm_method,sep=";")
  norm_method2comparison_results[[pair_name]] = list(
    single_metabolite_de = single_metabolite_de,
    single_metabolite_corrs = single_metabolite_corrs,
    named2covered_shared_metabolites = named2covered_shared_metabolites
  )
}

We next transform the data above into tables that contain data for each combination of metabolite, time point, and tissue. These are then used for different meta-analyses: (1) a simple random effects analysis, (2) random effects with a binary covariate indicating if a dataset is targeted or untargeted, (3) redo the RE model of (1) with the targeted data only, and (4) redo the RE model of (1) with the untargeted data only.

library(metafor)
# For each normalization pair, run a meta-analysis for every
# (metabolite, tissue, time point) combination:
#   re_model1     - all datasets
#   re_model2     - all datasets with an is_targeted moderator
#   re_model_tar  - targeted datasets only
#   re_model_untar- untargeted datasets only
# NOTE(review): the objects are named re_model* (random effects) but all
# models are fit with method="FE" (fixed effects) - confirm intended
for(pair_name in names(norm_method2comparison_results)){
  meta_analysis_stats = list()
  single_metabolite_de = 
    norm_method2comparison_results[[pair_name]]$single_metabolite_de
  for(tissue in unique(single_metabolite_de$tissue)){
    for(tp in unique(single_metabolite_de$tp)){
      curr_subset = single_metabolite_de[
        single_metabolite_de$tissue==tissue &
          single_metabolite_de$tp==tp,]
      for(metabolite in unique(curr_subset$metabolite)){
        curr_met_data = curr_subset[
          curr_subset$metabolite==metabolite,]
        # Sampling variance from the reported standard error
        curr_met_data$var = curr_met_data$Std^2
        # try() blocks: a model that fails to converge stays NULL
        re_model1 = NULL;re_model2=NULL
        re_model_tar = NULL;re_model_untar = NULL
        try({re_model1 = rma.uni(curr_met_data$Est,curr_met_data$var,method="FE")})
        try({re_model2 = rma.mv(curr_met_data$Est,curr_met_data$var,
                           mods=curr_met_data$is_targeted,method="FE")})
        try({re_model_tar = rma.uni(
          curr_met_data[curr_met_data$is_targeted==1,"Est"],
          curr_met_data[curr_met_data$is_targeted==1,"var"],
          method="FE"
        )})
        try({re_model_untar = rma.uni(
          curr_met_data[curr_met_data$is_targeted==0,"Est"],
          curr_met_data[curr_met_data$is_targeted==0,"var"],
          method="FE"
        )})
        meta_analysis_stats[[paste(metabolite,tissue,tp,sep=",")]] = 
          list(curr_met_data=curr_met_data,re_model1=re_model1,
            re_model2 = re_model2,re_model_tar=re_model_tar,
            re_model_untar = re_model_untar)
      }
    }
  }
  norm_method2comparison_results[[pair_name]]$meta_analysis_stats =
    meta_analysis_stats
}

Targeted vs. Untargeted: compound overlap

We first plot the number and percentage of metabolites in the targeted datasets that are measured in at least one untargeted dataset.

library(ggplot2)
# Total number of metabolites per dataset
dataset2num_metabolites = sapply(metabolomics_processed_datasets, 
                                 function(x)nrow(x$sample_data))
# NOTE(review): named2covered_shared_metabolites here is the leftover
# value from the LAST normalization pair of the comparison loop above -
# confirm that coverage does not depend on the normalization choice
named_dataset_coverage = sapply(named2covered_shared_metabolites,length)
named_dataset_coverage = data.frame(
  name = names(named_dataset_coverage),
  percentage = named_dataset_coverage /
  dataset2num_metabolites[names(named_dataset_coverage)],
  count = named_dataset_coverage,
  total = dataset2num_metabolites[names(named_dataset_coverage)]
)
# add datasets with no coverage
# NOTE(review): assumes every untargeted dataset name contains "untar" -
# verify this naming convention holds for all datasets
all_targeted_datasets = names(metabolomics_processed_datasets)
all_targeted_datasets = all_targeted_datasets[!grepl("untar",all_targeted_datasets)]
zero_coverage_datasets = setdiff(all_targeted_datasets,
                                 named_dataset_coverage$name)
zero_coverage_datasets = data.frame(
  name = zero_coverage_datasets,
  percentage = rep(0,length(zero_coverage_datasets)),
  count = rep(0,length(zero_coverage_datasets)),
  total = dataset2num_metabolites[zero_coverage_datasets]
)
named_dataset_coverage = rbind(named_dataset_coverage,
                           zero_coverage_datasets)
named_dataset_coverage = 
  named_dataset_coverage[order(as.character(named_dataset_coverage$name)),]
# Horizontal bars: percentage covered, labeled with the absolute count
print(ggplot(named_dataset_coverage, aes(x=name, y=percentage)) + 
  geom_bar(stat = "identity",width=0.2) + coord_flip() +
  geom_text(data=named_dataset_coverage, 
            aes(name, percentage+0.05, label=count), 
            position = position_dodge(width=0.9),
            size=4) + 
  ggtitle("Targeted dataset: coverage by untargeted"))

Comparison results: normalization methods

Spearman correlations

Compare the normalization methods by their correlation distributions.

# Compare the normalization methods via the same-metabolite (diagonal)
# Spearman correlations between targeted and untargeted datasets,
# summarized as mean +/- sd per tissue.
rep_correlations <- c()
tissues <- unique(sapply(metabolomics_processed_datasets, function(x) x$tissue))
for (pair_name in names(norm_method2comparison_results)) {
  single_metabolite_corrs <-
    norm_method2comparison_results[[pair_name]]$single_metabolite_corrs
  for (tissue in tissues) {
    tissue_datasets <- names(single_metabolite_corrs)[
      grepl(tissue, names(single_metabolite_corrs))]
    # Pool the diagonal correlations of every targeted dataset's
    # comparison matrices in this tissue
    pooled_corrs <- c()
    for (ds in tissue_datasets) {
      corr_list <- single_metabolite_corrs[[ds]]
      pooled_corrs <- c(pooled_corrs, unname(unlist(sapply(corr_list, diag))))
    }
    rep_correlations <- rbind(
      rep_correlations,
      c(pair_name, tissue,
        mean(pooled_corrs, na.rm = TRUE), sd(pooled_corrs, na.rm = TRUE)))
  }
}
# Drop tissue/method rows with no data and convert to a typed data frame
rep_correlations <- rep_correlations[!is.na(rep_correlations[, 3]), ]
rep_correlations <- data.frame(
  "NormMethod" = rep_correlations[, 1],
  "Tissue" = rep_correlations[, 2],
  "Mean" = as.numeric(rep_correlations[, 3]),
  "SD" = as.numeric(rep_correlations[, 4])
)
rep_correlations <- rep_correlations[order(rep_correlations$Tissue), ]
# Grouped bars with error bars: one group per tissue, one bar per method
print(
  ggplot(rep_correlations, aes(x = Tissue, y = Mean, fill = NormMethod)) +
    geom_bar(position = position_dodge(), stat = "identity", colour = "black") +
    geom_errorbar(aes(ymin = Mean - SD, ymax = Mean + SD), na.rm = TRUE,
                  width = .2, position = position_dodge(.9))
)

Differential analysis results

# Significance threshold for the "naive" per-dataset analysis
pthr = 0.001
for(tissue in tissues){
  for(pair_name in names(norm_method2comparison_results)){
      meta_analysis_stats = 
          norm_method2comparison_results[[pair_name]]$meta_analysis_stats
      meta_analysis_stats = meta_analysis_stats[
        grepl(tissue,names(meta_analysis_stats))
      ]
      if(length(meta_analysis_stats)==0){next}
      # For each metabolite: count the targeted / untargeted datasets
      # in which it is significant at pthr
      naive_analysis_tar = sapply(meta_analysis_stats,
        function(x)sum(x$curr_met_data[,"Pvalue"]<pthr &
                     x$curr_met_data[,"is_targeted"]))
      naive_analysis_untar = sapply(meta_analysis_stats,
        function(x)sum(x$curr_met_data[,"Pvalue"]<pthr &
                     !x$curr_met_data[,"is_targeted"]))
      tb = table(naive_analysis_tar,naive_analysis_untar)
      # Zero out the (0,0) cell so it does not dominate the plot
      # NOTE(review): tb[1,1] assumes the first row/column correspond to
      # zero counts - only true if some metabolite has count 0 in each
      nonsig_in_both = tb[1,1]
      tb[1,1] = 0
      # NOTE(review): the y label says "log2 number of metabolites" but
      # raw counts are plotted - confirm which is intended
      barplot(t(tb),
              legend=T,xlab="Number of targeted datasets with p<0.001",
              ylab = "log2 number of metabolites",
              main = paste(tissue,pair_name))
  }
}

Meta-analysis results

library(VennDiagram)
require(gridExtra)
# Meta-analysis comparison: for every tissue and normalization pair,
# scatter the targeted vs. untargeted effect sizes and draw a Venn
# diagram of the metabolites significant in each analysis
pthr = 0.001
tissues = unique(sapply(metabolomics_processed_datasets,function(x)x$tissue))
tar_vs_untar_correlations = c()
for(tissue in tissues){
  for(pair_name in names(norm_method2comparison_results)){
    meta_analysis_stats = 
        norm_method2comparison_results[[pair_name]]$meta_analysis_stats
    meta_analysis_stats  = meta_analysis_stats[
        grepl(tissue,names(meta_analysis_stats))
      ]
    if(is.null(meta_analysis_stats) || length(meta_analysis_stats)==0){next}
  
    # P-value for the difference between targeted and untargeted
    # (the moderator coefficient of re_model2)
    targeted_diff_p = 
    sapply(meta_analysis_stats,function(x)x$re_model2$pval[2])

   # P-values - targeted vs. untargeted
    # NOTE(review): pvals_untar is filtered for empty entries (failed
    # models) but pvals_tar is not - if any untargeted model failed, the
    # two vectors may no longer align element-wise; confirm
    pvals_tar = sapply(meta_analysis_stats,function(x)x$re_model_tar$pval)
    pvals_untar = sapply(meta_analysis_stats,function(x)x$re_model_untar$pval)
    pvals_untar = unlist(pvals_untar[sapply(pvals_untar,length)>0])
    # Categorize each metabolite; "Both" overrides the single labels
    significant_in = rep("None",length(pvals_untar))
    significant_in[pvals_tar<pthr] = "Targeted"
    significant_in[pvals_untar<pthr] = "Untargeted"
    significant_in[pvals_tar<pthr & pvals_untar<pthr] = "Both"
    significant_diff = targeted_diff_p<pthr
    rho = cor(-log10(pvals_tar),-log10(pvals_untar),method = "pearson")
    # Betas - targeted vs. untargeted
    betas_tar = 
      sapply(meta_analysis_stats,function(x)x$re_model_tar$beta[1,1])
    betas_untar = 
      sapply(meta_analysis_stats,function(x)x$re_model_untar$beta[1,1])
    betas_untar = unlist(betas_untar[sapply(betas_untar,length)>0])
    df = data.frame(
      targeted = betas_tar,
      untargeted = betas_untar,
      significant_in = significant_in,
      significant_diff = significant_diff
    )
    rho_beta = cor(betas_untar,betas_tar,method = "pearson")
    rhop = cor.test(betas_untar,betas_tar,method = "pearson")$p.value
    # Scatter of effect sizes; the y=x line marks perfect agreement
    print(
      ggplot(df, aes(x=targeted, y=untargeted,
                 shape=significant_diff, color=significant_in)) +
      geom_point() + geom_abline(slope=1,intercept = 0) + 
      ggtitle(paste(tissue, pair_name,
                    "effects, rho=:",format(rho_beta,digits=2)))
    )
  
    tar_vs_untar_correlations = rbind(tar_vs_untar_correlations,
                                      c(tissue,pair_name,rho,rho_beta))
   # draw a venn diagram
    inter_area = sum(significant_in=="Both")
    tar_area = sum(significant_in=="Targeted") + inter_area
    untar_area = sum(significant_in=="Untargeted") + inter_area
    subt = paste("Not significant in both:",table(significant_in)["None"])
    
    venng = draw.pairwise.venn(tar_area,untar_area,inter_area,
            c("Targeted","Untargeted"),lty = rep("blank",2), 
            fill = c("pink", "cyan"), alpha = rep(0.5, 2),
            cat.dist = rep(0.01, 2),ind=F)
    # grid.newpage()
    grid.arrange(gTree(children=venng), 
                 top=paste(tissue,pair_name), bottom=subt)
  }
}

Comparison results: detailed analysis of the selected normalization

Spearman correlations

We examine the average correlation between the platforms (within tissues). Whenever two platforms share more than a single metabolite we plot both the average correlation between the same metabolites and between other metabolites. Adding the average correlation between platforms but with different metabolites is important as it gives some perspective to what a significant correlation is. That is, in many cases below, the average correlation may be greater than expected.

# Helpers for summarizing the Spearman correlation matrices between
# platforms.

# Mean of absolute values; extra arguments are forwarded to mean().
mean_abs <- function(x, ...) mean(abs(x), ...)

# SD of absolute values; extra arguments are forwarded to sd().
sd_abs <- function(x, ...) sd(abs(x), ...)

# Summarize a correlation matrix by applying `func` to the diagonal
# ("same": correlations between the same metabolite on two platforms)
# and to the strict lower triangle ("other": different-metabolite
# pairs). A 1x1 input has no off-diagonal entries, so "other" is NA.
extract_diag_vs_non_diag <- function(corrs, func = mean, ...) {
  if (length(corrs) == 1) {
    c(same = func(corrs, ...), other = NA)
  } else {
    c(same = func(diag(corrs), ...),
      other = func(c(corrs[lower.tri(corrs, diag = FALSE)]), ...))
  }
}

# Per targeted dataset: bar plot of the mean same-metabolite vs.
# other-metabolite correlation against each untargeted dataset,
# using the log2,imp;log2,imp normalization pair
single_metabolite_corrs =
  norm_method2comparison_results$`log2,imp;log2,imp`$single_metabolite_corrs
for(tar_dataset in names(single_metabolite_corrs)){
  l = single_metabolite_corrs[[tar_dataset]]
  if(length(l)==0){next}
  corr_info = as.data.frame(t(sapply(l, extract_diag_vs_non_diag)))
  corr_sd = as.data.frame(t(sapply(l, extract_diag_vs_non_diag,func=sd)))
  # shorten the row names (keep the platform/assay fields only)
  rownames(corr_info) = sapply(rownames(corr_info),
      function(x)paste(strsplit(x,split=",")[[1]][3:4],collapse=","))
  rownames(corr_sd) = rownames(corr_info)
  corr_info$dataset = rownames(corr_info)
  corr_sd$dataset = corr_info$dataset
  # NOTE(review): melt() is provided by reshape2/reshape, which is not
  # attached anywhere in the code shown here - confirm it is loaded by
  # one of the sourced tool scripts
  corr_info = melt(corr_info)
  corr_sd = melt(corr_sd)
  corr_info$sd = corr_sd$value
  print(
    ggplot(corr_info, aes(x=dataset, y=value, fill=variable)) +
      geom_bar(position=position_dodge(), stat="identity", colour='black') +
      geom_errorbar(aes(ymin=value-sd, ymax=value+sd),na.rm=T, 
                   width=.2,position=position_dodge(.9)) +
    ggtitle(tar_dataset) + xlab("Untargeted dataset") + ylab("Spearman") +
      labs(fill = "Pair type") + 
      theme(legend.position="top",legend.direction = "horizontal")
  )
}

Plot selected examples

Here are the results for lactate in plasma.

Example from a log2 dataset:

# Forest plots for lactate in plasma: log2-normalized targeted data
# vs. median/log-normalized untargeted data.
meta_analysis_stats <-
  norm_method2comparison_results$`log2,imp;med,log,imp`$meta_analysis_stats
lact_res <- meta_analysis_stats[
  grepl("lact", names(meta_analysis_stats), ignore.case = TRUE) &
    grepl("plasma", names(meta_analysis_stats), ignore.case = TRUE)
]
# Order results by the time point (third comma-separated field of the name)
lact_res_hours <- sapply(names(lact_res),
            function(x) as.numeric(strsplit(x, split = ",")[[1]][3]))
lact_res <- lact_res[order(lact_res_hours)]
# head() guards against fewer than six results (the original [1:6]
# would produce NA entries and fail)
for (lact_example in head(names(lact_res), 6)) {
  curr_labels <- gsub("plasma,", "",
                     meta_analysis_stats[[lact_example]][[1]][, 1])
  forest(meta_analysis_stats[[lact_example]]$re_model1,
       slab = curr_labels,
       main = lact_example, xlab = "Log fc",
       col = "blue", cex = 1.1)
}

Example from a scaled dataset:

# Forest plots for lactate in plasma on the scaled (unlogged) data:
# imp,none for both targeted and untargeted datasets.
meta_analysis_stats <-
  norm_method2comparison_results$`imp,none;imp,none`$meta_analysis_stats
lact_res <- meta_analysis_stats[
  grepl("lact", names(meta_analysis_stats), ignore.case = TRUE) &
    grepl("plasma", names(meta_analysis_stats), ignore.case = TRUE)
]
# Order results by the time point (third comma-separated field of the name)
lact_res_hours <- sapply(names(lact_res),
            function(x) as.numeric(strsplit(x, split = ",")[[1]][3]))
lact_res <- lact_res[order(lact_res_hours)]
# head() guards against fewer than six results (the original [1:6]
# would produce NA entries and fail)
for (lact_example in head(names(lact_res), 6)) {
  curr_labels <- gsub("plasma,", "",
                     meta_analysis_stats[[lact_example]][[1]][, 1])
  forest(meta_analysis_stats[[lact_example]]$re_model1,
       slab = curr_labels,
       main = lact_example, xlab = "Log fc",
       col = "blue", cex = 1.1)
}

We can now check the same analysis for liver:

# Repeat the lactate forest plots for liver, reusing the
# meta_analysis_stats object loaded in the previous chunk.
lact_res <- meta_analysis_stats[
  grepl("lact", names(meta_analysis_stats), ignore.case = TRUE) &
    grepl("liver", names(meta_analysis_stats), ignore.case = TRUE)
]
# Order results by the time point (third comma-separated field of the name)
lact_res_hours <- sapply(names(lact_res),
            function(x) as.numeric(strsplit(x, split = ",")[[1]][3]))
lact_res <- lact_res[order(lact_res_hours)]
# head() guards against fewer than six results (the original [1:6]
# would produce NA entries and fail)
for (lact_example in head(names(lact_res), 6)) {
  curr_labels <- gsub("liver_powder,", "",
                     meta_analysis_stats[[lact_example]][[1]][, 1])
  forest(meta_analysis_stats[[lact_example]]$re_model1,
       slab = curr_labels,
       main = lact_example, xlab = "Log fc",
       col = "blue", cex = 1.1)
}

From the plots above we take the most extreme examples and examine their forest plots.

# Reload the scaled-normalization meta-analysis results and extract the
# p-values used to pick illustrative examples below.
meta_analysis_stats = 
  norm_method2comparison_results$`imp,none;imp,none`$meta_analysis_stats

# P-value for the difference between targeted and untargeted
# (second coefficient of the moderator model re_model2)
targeted_diff_p = 
  sapply(meta_analysis_stats,function(x)x$re_model2$pval[2])

# Platform-specific p-values: targeted vs. untargeted models
pvals_tar = sapply(meta_analysis_stats,function(x)x$re_model_tar$pval)
pvals_untar = sapply(meta_analysis_stats,function(x)x$re_model_untar$pval)
pvals_untar = unlist(pvals_untar[sapply(pvals_untar,length)>0])
# NOTE(review): pvals_untar is a filtered subset, so the elementwise '&'
# below relies on recycling if its length differs from pvals_tar's —
# TODO confirm the two vectors are aligned (ideally match by name).

# Randomly pick one analyte that is significant on both platforms and whose
# platform effects agree.
# BUG FIX: the original sample(which(...))[1] misbehaves when which()
# returns a single index k — R's sample(k) then permutes 1:k and returns a
# wrong index. Index the candidate set explicitly instead.
agree_idx = which(pvals_tar < 1e-5 & pvals_untar < 1e-5 &
                    targeted_diff_p > 0.1)
agree_example = names(agree_idx)[sample.int(length(agree_idx),1)]
# Clean forest-plot study labels: drop the ",untargeted" platform suffix and
# the leading tissue prefix (taken from the first label's first field).
# Accepts a character vector; returns a vector of the same length.
simplify_labels_for_forest <- function(s){
  cleaned = gsub(",untargeted","",s)
  tissue_prefix = strsplit(cleaned,split=",")[[1]][1]
  gsub(paste(tissue_prefix,",",sep=""),"",cleaned)
}
# Helper: draw one random name from a named index vector.
# BUG FIX: replaces names(sample(which(...))[1]) — when which() returns a
# single index k, R's sample(k) permutes 1:k and picks a wrong element.
sample_one_name <- function(idx){
  names(idx)[sample.int(length(idx),1)]
}

# Example 1: significant on both platforms, effects agree
forest(meta_analysis_stats[[agree_example]]$re_model1,
  slab = simplify_labels_for_forest(
    meta_analysis_stats[[agree_example]][[1]][,1]),
  main = paste(agree_example,"significant in both, tar and untar agree",sep="\n"),
  xlab = "Log fc",col = "blue")

# Example 2: significant on both platforms, but effect sizes disagree
agree_p_disagree_beta = sample_one_name(which(pvals_tar < 1e-5 &
                                                pvals_untar < 1e-5 &
                                                targeted_diff_p < 0.001))
forest(meta_analysis_stats[[agree_p_disagree_beta]]$re_model1,
  slab = simplify_labels_for_forest(
    meta_analysis_stats[[agree_p_disagree_beta]][[1]][,1]),
  main = paste(agree_p_disagree_beta,
               "significant in both, tar and untar disagree",sep="\n"),
  xlab = "Log fc",col = "blue")

# Example 3: significant in targeted only
disagree_example1 = sample_one_name(which(pvals_tar < 1e-10 & pvals_untar > 0.1))
forest(meta_analysis_stats[[disagree_example1]]$re_model1,
  slab = simplify_labels_for_forest(
    meta_analysis_stats[[disagree_example1]][[1]][,1]),
  main = paste(disagree_example1,
               "significant targeted, tar and untar disagree",sep="\n"),
  xlab = "Log fc",col = "blue")

# Example 4: significant in untargeted only
disagree_example2 = sample_one_name(which(pvals_tar > 0.1 & pvals_untar < 1e-20))
forest(meta_analysis_stats[[disagree_example2]]$re_model1,
  slab = simplify_labels_for_forest(
    meta_analysis_stats[[disagree_example2]][[1]][,1]),
  main = paste(disagree_example2,
               "significant in untargeted, tar and untar disagree",sep="\n"),
  xlab = "Log fc",col = "blue")

Targeted vs. untargeted: comparison as a prediction task

Use 5-fold cross-validation for the analysis within tissues. For each pair of targeted and untargeted datasets from the same tissue, we use the untargeted data as the predictive features and each metabolite in the targeted dataset as a dependent variable. The code below uses feature selection and random forests to train the predictive models.

# Targeted vs. untargeted comparison as a prediction task.
# For each (targeted nn1, untargeted nn2) dataset pair from the same tissue,
# the untargeted profiles plus clinical covariates are the features (x) and
# each metabolite of the targeted dataset is a dependent variable, predicted
# with feature selection + random forests under nfolds-fold cross-validation.
nfolds = 5
prediction_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
  nn1_tissue = strsplit(nn1,split=",")[[1]][1]
  nn1_tissue = gsub("_powder","",nn1_tissue)
  # labels (y) must come from a targeted dataset
  if(grepl("untargeted",nn1)){next}
  for(nn2 in names(metabolomics_processed_datasets)){
    if(nn2 == nn1){next}
    # features (x) must come from an untargeted dataset
    if(!grepl("untargeted",nn2)){next}
    nn2_tissue = strsplit(nn2,split=",")[[1]][1]
    nn2_tissue = gsub("_powder","",nn2_tissue)
    nn2_dataset = strsplit(nn2,split=",")[[1]][2]
    # only pair datasets measured on the same tissue
    if(nn1_tissue!=nn2_tissue){next}
    print(paste("features from:",nn2))
    print(paste("labels from:",nn1))
    # get the numeric datasets and their annotation
    y = metabolomics_processed_datasets[[nn1]]$normalized_data[[1]]
    x = metabolomics_processed_datasets[[nn2]]$normalized_data[[1]]
    # align the sample sets: map vial labels to biological ids (BIDs)
    bid_y = merged_dmaqc_data[colnames(y),"bid"]
    bid_x = merged_dmaqc_data[colnames(x),"bid"]    
    # step 1: merge samples from the same BID
    if(length(unique(bid_x))!=length(bid_x)){
      x = aggregate_repeated_samples(x,bid_x)
    } else {
      colnames(x) = bid_x
    }
    if(length(unique(bid_y))!=length(bid_y)){
      y = aggregate_repeated_samples(y,bid_y)
    } else {
      colnames(y) = bid_y
    }
    # step 2: keep only BIDs present in both; transpose to samples x features
    shared_bids = as.character(intersect(colnames(y),colnames(x)))
    x = t(as.matrix(x[,shared_bids]))
    y = t(as.matrix(y[,shared_bids]))
    # At this point x and y are over the same BIDs, now we add the metadata
    y_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
    rownames(y_meta) = y_meta$bid
    y_meta = y_meta[shared_bids,]
    
    # take the covariates (ignore distances)
    curr_cov_cols = intersect(colnames(y_meta),biospec_cols[2])
    curr_covs = data.frame(y_meta[,curr_cov_cols])
    names(curr_covs) = curr_cov_cols
    curr_covs$sex = y_meta$animal.registration.sex # add sex
    # add the covariates into x
    x = cbind(x,curr_covs)
    
    # Run the regressions: random fold assignment, one model per metabolite
    folds = sample(rep(1:nfolds,(1+nrow(x)/nfolds)))[1:nrow(x)]
    numFeatures = min(ncol(x),2000)
    preds = c();real=c()
    for(i in seq_len(ncol(y))){
      if( i %% 10 == 0){print(paste("analyzing metabolite number:",i))}
      # BUG FIX: was y[,1], which predicted the first metabolite for every i
      y_i = y[,i]
      i_preds = c();i_real=c()
      for(j in 1:nfolds){
        tr_x = x[folds!=j,]
        tr_yi = y_i[folds!=j]
        te_x = x[folds==j,]
        te_y = y_i[folds==j]
        # feature selection (by coefficient of variation) + random forest
        # model = randomForest(tr_yi,x=tr_x,ntree = 20)
        # te_preds = predict(model,newdata = te_x)
        model = feature_selection_wrapper(tr_x,tr_yi,
                   coeff_of_var,randomForest,
                   topK = numFeatures,ntree=50)
        te_preds = predict(model,newdata = te_x)
        i_preds = c(i_preds,te_preds)
        i_real = c(i_real,te_y)
      }
      preds = cbind(preds,i_preds)
      real = cbind(real,i_real)
    }
    colnames(preds) = colnames(y)
    # BUG FIX: was colnames(reals) — 'reals' is undefined; the object is 'real'
    colnames(real) = colnames(y)
    currname = paste(nn1,nn2,sep=";")
    prediction_analysis_results[[currname]] = list(
      preds = preds,real=real
    )
  }
}
save_to_bucket(prediction_analysis_results,
               file="tar_vs_untar_prediction_analysis_results.RData",
               bucket = "gs://bic_data_analysis/pass1a/metabolomics/")

We now take the predicted and real values and estimate the prediction accuracy in different ways.

# Reload the saved predictions and compute per-metabolite accuracy metrics
# (Spearman rho, MSE/RMSE, variance-relative MSE, coefficient of variation),
# organized as results_metrics[[tissue]][[targeted]][[untargeted]].
prediction_analysis_results = load_from_bucket(
  "tar_vs_untar_prediction_analysis_results.RData",
  "gs://bic_data_analysis/pass1a/metabolomics/",F)
prediction_analysis_results = prediction_analysis_results[[1]]

results_metrics = list()
for(result_name in names(prediction_analysis_results)){
  curr_result = prediction_analysis_results[[result_name]]
  preds = curr_result$preds
  real = curr_result$real
  # result names are "targeted;untargeted"
  name_parts = strsplit(result_name,split=";")[[1]]
  tar_name = name_parts[1]
  untar_name = name_parts[2]
  # recover the metabolite names from the targeted dataset's rows
  y = metabolomics_processed_datasets[[tar_name]]$normalized_data[[1]]
  colnames(preds) = rownames(y)
  colnames(real) = rownames(y)
  tar_name = simplify_metab_dataset_name(tar_name)
  untar_name = simplify_metab_dataset_name(untar_name)
  currtissue = strsplit(tar_name,split=",")[[1]][1]
  tissue_prefix = paste(currtissue,",",sep="")
  tar_name = gsub(tissue_prefix,"",tar_name)
  untar_name = gsub(tissue_prefix,"",untar_name)
  if(! currtissue %in% names(results_metrics)){
    results_metrics[[currtissue]] = list()
  }
  if(! tar_name %in% names(results_metrics[[currtissue]])){
    results_metrics[[currtissue]][[tar_name]] = list()
  }
  
  # Spearman correlation between predicted and observed, per metabolite
  # (format() rounds to 3 significant digits before converting back)
  rhos = as.numeric(format(diag(cor(preds,real,method="spearman")),digits=3))
  # squared-error based metrics
  SEs = colSums((preds-real)^2)
  MSEs = SEs / nrow(preds)
  RMSE = sqrt(MSEs)
  rMSE = MSEs / apply(y,1,var) # MSE relative to each metabolite's variance
  CoVs = apply(y,1,sd) / apply(y,1,mean)
  discCoVs = cut(CoVs,breaks = 2,ordered_result = T)
  
  results_metrics[[currtissue]][[tar_name]][[untar_name]] = data.frame(
    rhos,MSEs,RMSE,rMSE,CoVs,discCoVs
  )
}

We now present a few summary plots.

# Summary plots: one boxplot panel per (tissue, targeted dataset), showing
# per-metabolite Spearman correlations stratified by discretized CoV and by
# the untargeted feature dataset.
for(tissue in names(results_metrics)){
  for(tar in names(results_metrics[[tissue]])){
    per_untar = results_metrics[[tissue]][[tar]]
    rho_vs_cv = c()
    for(untar in names(per_untar)){
      curr_tab = data.frame(
        dataset = untar,
        per_untar[[untar]][,c("rhos","discCoVs")]
      )
      curr_tab$discCoVs = as.numeric(curr_tab$discCoVs)
      rho_vs_cv = rbind(rho_vs_cv,curr_tab)
    }
    boxplot(rhos~discCoVs:dataset,data=rho_vs_cv,las=2,
            ylab="Spearman",xlab = "",ylim=c(0,1),
            main = paste(tissue,tar,sep=","))
  }
}

As additional references, we train several further models below. First, we check the prediction performance of naive models that use technical and clinical covariates only. Second, we consider multi-task regression and other multivariate models (shown commented out below).

# Reference models: predict each targeted metabolite from technical/clinical
# covariates only (sex, test weight, test distance, time point), using the
# same nfolds cross-validation scheme as the untargeted-feature models above.
cov_prediction_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
  nn1_tissue = strsplit(nn1,split=",")[[1]][1]
  nn1_tissue = gsub("_powder","",nn1_tissue)
  # labels come from targeted datasets only
  if(grepl("untargeted",nn1)){next}
  print(nn1)
  y = metabolomics_processed_datasets[[nn1]]$normalized_data[[1]]
  y_vials = colnames(y)
  bid_y = merged_dmaqc_data[colnames(y),"bid"]
  colnames(y) = bid_y
  y = t(as.matrix(y))
  # skip very large panels
  if(ncol(y)>1000){next}
  cov_cols = c("animal.registration.sex",
             "acute.test.weight",
             "acute.test.distance",
             "animal.key.timepoint")
  # covariate rows are aligned with y's rows via the original vial labels
  covs = merged_dmaqc_data[y_vials,cov_cols]
  x = covs
  
  # Run the regressions: random fold assignment, one model per metabolite
  folds = sample(rep(1:nfolds,(1+nrow(x)/nfolds)))[1:nrow(x)]
  numFeatures = min(ncol(x),2000)
  preds = c();real=c()
  for(i in seq_len(ncol(y))){
    # BUG FIX: was y[,1], which predicted the first metabolite for every i
    y_i = y[,i]
    i_preds = c();i_real=c()
    for(j in 1:nfolds){
      print(j)
      tr_x = x[folds!=j,]
      tr_yi = y_i[folds!=j]
      te_x = x[folds==j,]
      te_y = y_i[folds==j]
      # random forest on the covariates only (no feature selection needed)
      model = randomForest(tr_yi,x=tr_x,ntree = 20)
      te_preds = predict(model,newdata = te_x)
      i_preds = c(i_preds,te_preds)
      i_real = c(i_real,te_y)
    }
    preds = cbind(preds,i_preds)
    real = cbind(real,i_real)
  }
  cov_prediction_analysis_results[[nn1]] = list(
      preds = preds,real=real
    )
}

# ---------------------------------------------------------------------------
# Exploratory alternatives kept for reference (all commented out):
# (1) multi-task regression via MTL_wrapper, (2) PLS regression, and
# (3) a PCA-based estimate of shared variance after regressing out
# sex/weight covariates. None of these are part of the current pipeline.
# ---------------------------------------------------------------------------
# preds = c();real=c()
# for(j in 1:nfolds){
#   tr_x = x[folds!=j,]
#   tr_y = y[folds!=j,]
#   te_x = x[folds==j,]
#   te_y = y[folds==j,]
#   model = MTL_wrapper(tr_x,tr_y,type="Regression", Regularization="L21")
#   te_preds = predict(model,te_x)
#   real = rbind(real,te_y)
#   preds = rbind(preds,te_preds)
# }
# diag(cor(preds,real))

# Using PLS regression
# library(pls)
# pls_model = plsr(y~x,ncomp = 5,validation="LOO")
# eval = MSEP(pls_model)
# 
# y_pca = prcomp(y)
# plot(y_pca)
# explained_var = y_pca$sdev^2/sum(y_pca$sdev^2)
# y_pca_matrix = y_pca$x[,1:10]
# 
# # regress out sex, weight
# 
# get_explained_variance_using_PCA(x,y)
# x = apply(x,2,regress_out,covs=covs)
# y = apply(y,2,regress_out,covs=covs)
# get_explained_variance_using_PCA(x,y)